Project.1: Titanic Machine Learning from Disaster

필요한 패키지와 라이브러리 다운로드

#install.packages('plotly')
#install.packages('ggthemes')
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## 다음의 패키지를 부착합니다: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(data.table)
## 
## 다음의 패키지를 부착합니다: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)
library(ggthemes)
library(dplyr)

CSV를 데이터프레임에 가져오기

test <- read.csv('test.csv', na.strings = c(''))
train <- read.csv('train.csv', na.strings = c(''))

결측치 및 이상치 확인

colSums(is.na(test))
## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0          86           0 
##       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           1         327           0
colSums(is.na(train))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

Feature Engineering

Family Size라는 새로운 Column 추가

test$FamilySize <- test$SibSp + test$Parch + 1 #형제+부모+자신
train$FamilySize <- train$SibSp + train$Parch + 1

Fare의 결측치는 평균으로 대체

test$Fare[is.na(test$Fare)] <- mean(test$Fare, na.rm = TRUE)

Embarked의 결측치는 다수값으로 대체

table(train$Embarked) #S가 가장 많다
## 
##   C   Q   S 
## 168  77 644
train$Embarked[is.na(train$Embarked)] <- 'S'

Age평균 -> 이름에서 surname별로 구하기

  1. surname 추출 및 변환
train$SName <- gsub('(.*, )|(\\..*)','',train$Name)
table(train$Sex, train$SName)
##         
##          Capt Col Don  Dr Jonkheer Lady Major Master Miss Mlle Mme  Mr Mrs  Ms
##   female    0   0   0   1        0    1     0      0  182    2   1   0 125   1
##   male      1   2   1   6        1    0     2     40    0    0   0 517   0   0
##         
##          Rev Sir the Countess
##   female   0   0            1
##   male     6   1            0
ect <- c('Capt','Col','Don','Dr','Jonkheer','the Countess',
         'Lady','Major','Mlle','Mme','Rev','Sir','Dona')
train$SName[train$SName %in% ect] <- 'Others'
table(train$Sex, train$SName)
##         
##          Master Miss  Mr Mrs  Ms Others
##   female      0  182   0 125   1      6
##   male       40    0 517   0   0     20

  1. surname별 나이 평균
m_M <- mean(train$Age[train$SName=='Master'], na.rm = TRUE)
m_Mr <- mean(train$Age[train$SName=='Mr'], na.rm = TRUE)
m_Mrs <- mean(train$Age[train$SName=='Mrs'], na.rm = TRUE)
m_Mis <- mean(train$Age[train$SName=='Miss'], na.rm = TRUE)
m_Ms <- mean(train$Age[train$SName=='Ms'], na.rm = TRUE)
m_O <- mean(train$Age[train$SName=='Others'], na.rm = TRUE)

  1. 위에서 구한 평균으로 NaN 채우기
train$Age <- ifelse(train$SName=='Master', ifelse(is.na(train$Age), m_M, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Mr', ifelse(is.na(train$Age), m_Mr, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Mrs', ifelse(is.na(train$Age), m_Mrs, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Miss', ifelse(is.na(train$Age), m_Mis, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Ms', ifelse(is.na(train$Age), m_Ms, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Others', ifelse(is.na(train$Age), m_O, train$Age), train$Age)

  1. testset도 동일하게
test$SName <- gsub('(.*, )|(\\..*)','',test$Name)
test$SName[test$SName %in% ect] <- 'Others'

M <- mean(test$Age[test$SName=='Master'], na.rm = TRUE)
Mr <- mean(test$Age[test$SName=='Mr'], na.rm = TRUE)
Mrs <- mean(test$Age[test$SName=='Mrs'], na.rm = TRUE)
Mis <- mean(test$Age[test$SName=='Miss'], na.rm = TRUE)
Ms <- mean(test$Age[test$SName=='Ms'], na.rm = TRUE)
O <- mean(test$Age[test$SName=='Others'], na.rm = TRUE)

test$Age <- ifelse(test$SName=='Master', ifelse(is.na(test$Age), M, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Mr', ifelse(is.na(test$Age), Mr, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Mrs', ifelse(is.na(test$Age), Mrs, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Miss', ifelse(is.na(test$Age), Mis, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Ms', ifelse(is.na(test$Age), Ms, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Others', ifelse(is.na(test$Age), O, test$Age), test$Age)

필요없는 열 제거

train <- subset(train, select=-c(Name, Cabin, Ticket))
test <- subset(test, select=-c(Name, Cabin, Ticket))

각 데이터 셋의 라벨 변환

# train dataset
train$FamilySize = cut(train$FamilySize, c(0, 1, 4, 15), 
                       include.lowest = TRUE)
levels(train$FamilySize) = c("single", "small family(2~4)", "big family(5+)")
levels(train$Survived) = c('Not Survived','Survived')

# test dataset
test$FamilySize = cut(test$FamilySize, c(0, 1, 4, 15), 
                      include.lowest = TRUE)
levels(test$FamilySize) = c("single", "small family(2~4)", "big family(5+)")

타입변환 : 범주형데이터(factor)

# train dataset
train <- train %>%
  dplyr::mutate(Survived = factor(Survived),
                Pclass   = factor(Pclass,ordered = TRUE),
                SName    = factor(SName),
                Sex      = factor(Sex),
                FamilySize   = factor(FamilySize),
                Embarked = factor(Embarked))

# test dataset
test <- test %>%
  dplyr::mutate(Pclass   = factor(Pclass,ordered = TRUE),
                SName    = factor(SName),
                Sex      = factor(Sex),
                FamilySize   = factor(FamilySize),
                Embarked = factor(Embarked))

##EDA
#1 Fare
p1 <- plot_ly(type='box',
              data=train, 
              x=~Survived, 
              y=~Fare,
              color=~Survived,
              alpha = 0.3
              )%>%
  layout(title='Survivor by Fare')
p1
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#group_by(train$Embarked)
#2 Embarked
p2 <- train %>%
  ggplot(aes(Embarked,fill = Survived)) +
  geom_bar(stat='count',position = "fill", alpha=0.85) + 
  scale_fill_brewer(palette = "Paired") + 
  theme_minimal()+
  #scale_y_continuous(labels = percent) +
  labs(x = "Embarked", y = "Survival Rate",
       title = "Survival by Embarked",
       fill = 'Survival')

p2

#3 family size
p3 <- train %>% 
  ggplot(aes(FamilySize,fill = Survived)) +
  geom_bar(stat='count',position = "fill", alpha=0.85) + 
  scale_fill_brewer(palette = "Paired") + 
  theme_minimal()+
  #scale_y_continuous(labels = percent) +
  labs(x = "FamilySize", y = "Survival Rate",
       title = "Survival by FamilySize",
       fill = 'Survival')

p3

#4 Age, Sex
p4 <- train %>%
  filter(Survived=='1') %>% #생존자들 분포
  ggplot(aes(x=Age, fill=Sex)) +
  geom_density(alpha=0.4)+
  theme_minimal()+
  labs(title = "Survival Distribution")
p4

p5 <- train %>%
  filter(Survived=='0') %>% #사망자들 분포
  ggplot(aes(x=Age, fill=Sex)) +
  geom_density(alpha=0.4)+
  theme_minimal()+
  labs(title = "Non-Survival Distribution")
p5

###로지스틱선형 회귀

#모델
levels(train$Survived) = c(0,1) #복구,, 필요한가?
#View(train)
model <- glm(Survived ~., family = binomial(link=logit), data=train)
model
## 
## Call:  glm(formula = Survived ~ ., family = binomial(link = logit), 
##     data = train)
## 
## Coefficients:
##                 (Intercept)                  PassengerId  
##                   1.892e+01                    2.043e-05  
##                    Pclass.L                     Pclass.Q  
##                  -1.472e+00                    7.241e-02  
##                     Sexmale                          Age  
##                  -1.586e+01                   -2.579e-02  
##                       SibSp                        Parch  
##                  -7.310e-02                    8.344e-02  
##                        Fare                    EmbarkedQ  
##                   3.922e-03                    3.175e-02  
##                   EmbarkedS  FamilySizesmall family(2~4)  
##                  -3.065e-01                   -3.399e-01  
##    FamilySizebig family(5+)                    SNameMiss  
##                  -3.048e+00                   -1.651e+01  
##                     SNameMr                     SNameMrs  
##                  -3.515e+00                   -1.578e+01  
##                     SNameMs                  SNameOthers  
##                  -2.331e+00                   -3.544e+00  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  873 Residual
## Null Deviance:       1187 
## Residual Deviance: 710.7     AIC: 746.7
summary(model)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial(link = logit), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7021  -0.5322  -0.3847   0.5505   2.4268  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  1.892e+01  5.804e+02   0.033  0.97399    
## PassengerId                  2.043e-05  3.739e-04   0.055  0.95643    
## Pclass.L                    -1.472e+00  2.296e-01  -6.411 1.45e-10 ***
## Pclass.Q                     7.241e-02  2.026e-01   0.357  0.72076    
## Sexmale                     -1.586e+01  5.804e+02  -0.027  0.97820    
## Age                         -2.579e-02  9.757e-03  -2.643  0.00822 ** 
## SibSp                       -7.310e-02  2.141e-01  -0.341  0.73279    
## Parch                        8.344e-02  2.182e-01   0.382  0.70219    
## Fare                         3.922e-03  2.678e-03   1.465  0.14297    
## EmbarkedQ                    3.175e-02  4.031e-01   0.079  0.93722    
## EmbarkedS                   -3.065e-01  2.573e-01  -1.191  0.23364    
## FamilySizesmall family(2~4) -3.399e-01  3.747e-01  -0.907  0.36428    
## FamilySizebig family(5+)    -3.048e+00  1.103e+00  -2.762  0.00575 ** 
## SNameMiss                   -1.651e+01  5.804e+02  -0.028  0.97730    
## SNameMr                     -3.515e+00  6.026e-01  -5.832 5.47e-09 ***
## SNameMrs                    -1.578e+01  5.804e+02  -0.027  0.97831    
## SNameMs                     -2.331e+00  1.567e+03  -0.001  0.99881    
## SNameOthers                 -3.544e+00  8.302e-01  -4.270 1.96e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  710.73  on 873  degrees of freedom
## AIC: 746.73
## 
## Number of Fisher Scoring iterations: 14
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survived
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          890    1186.66              
## PassengerId  1    0.022       889    1186.63    0.8812    
## Pclass       2  103.808       887    1082.82 < 2.2e-16 ***
## Sex          1  256.002       886     826.82 < 2.2e-16 ***
## Age          1   24.813       885     802.01 6.318e-07 ***
## SibSp        1   16.414       884     785.60 5.091e-05 ***
## Parch        1    0.496       883     785.10    0.4812    
## Fare         1    1.529       882     783.57    0.2163    
## Embarked     2    3.775       880     779.80    0.1514    
## FamilySize   2   20.143       878     759.65 4.228e-05 ***
## SName        5   48.923       873     710.73 2.301e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 예측
result <- predict(model,newdata=test,type='response')
result <- ifelse(result > 0.5, 1, 0)
result
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   0   1   0   0   1   0   1   0   1   0   0   0   1   0   1   1   0   0   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   0   1   1   1   1   0   1   0   0   0   0   0   1   1   0   0   1   1   0   0 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   0   0   0   1   1   0   0   0   1   1   0   0   1   1   0   0   0   0   0   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   0   0   0   1   1   1   1   0   0   1   1   0   1   1   1   1   0   1   0   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   0   0   0   0   0   1   1  NA   1   1   0   1   0   1   0   1   0   1   0 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   0   0   0   1   0   0   0   0   0   0   1   1   1   1   0   0   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   0   1   0   0   1   0   1   0   0   0   0   0   0   0   0   0   0   1   0 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   0   1   0   0   0   0   0   0   0   0   1   0   0   1   0   0   1   1   0   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   0   0   1   0   0   1   1   0   0   0   0   0   1   1   0   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   0   0   1   0   1   0   1   0   0   0   0   0   1   0   1   0   1   1   0   1 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 
##   1   1   0   1   0   0   1   0   1   0   0   0   0   1   0   0   1   0   1   0 
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##   1   0   1   0   1   1   0   1   0   0   0   1   0   0   0   0   0   0   1   1 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
##   1   1   0   0   1   0   1   0   1   1   1   0   1   0   0   0   0   0   1   0 
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 
##   0   0   1   1   0   0   0   0   1   0   0   0   1   1   0   1   0   0   0   0 
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 
##   1   1   1   1   1   0   0   0   0   0   0   1   0   0   0   0   1   0   0   0 
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 
##   0   0   0   0   1   1   0   1   0   1   0   0   0   1   1   1   0   0   0   0 
## 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 
##   0   0   0   0   1   0   1   0   0   0   1   0   0   1   0   0   0   0   0   1 
## 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   0   0   0   1   1   1   0   1   0   1   1   0   0   0   1   0   1   0   0   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 
##   0   1   1   0   1   0   0   1   1   0   0   1   0   0   1   1   1   0   0   0 
## 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 
##   0   0   1   1   0   1   0   0   0   0   1   1   1   0   0   1   0   1   0   0 
## 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 
##   1   0   1   1   0   0   0   0   1   1   1   1   1   0   1   0   0   1